Lab 6

library(ggthemes)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
✔ broom        1.0.7     ✔ rsample      1.2.1
✔ dials        1.4.0     ✔ tune         1.3.0
✔ infer        1.0.7     ✔ workflows    1.2.0
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.3.1     ✔ yardstick    1.3.2
✔ recipes      1.2.1     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
library(powerjoin)
library(glue)
library(vip)

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi
library(baguette)
library(dplyr)
root  <- 'https://gdex.ucar.edu/dataset/camels/file'
# Make sure the download directory exists first; download.file() errors
# if 'data/' is missing.
if (!dir.exists('data')) dir.create('data', recursive = TRUE)
# mode = "wb" so the PDF is written in binary and not corrupted on Windows.
download.file('https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf',
              'data/camels_attributes_v2.0.pdf', mode = 'wb')
types <- c("clim", "geol", "soil", "topo", "vege", "hydro")
# Where the files live online ...
remote_files  <- glue('{root}/camels_{types}.txt')
# where we want to download the data ...
local_files   <- glue('data/camels_{types}.txt')
walk2(remote_files, local_files, download.file, quiet = TRUE)
# Read every attribute table and merge them all on the shared gauge id
camels <- map(local_files, read_delim, show_col_types = FALSE)
camels <- power_full_join(camels, by = 'gauge_id')

Question 1

# Map of CAMELS gauge locations, colored by mean daily discharge (q_mean)
q_mean_map <- ggplot(camels, aes(gauge_lon, gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = q_mean)) +
  scale_color_gradient(low = "pink", high = "dodgerblue") +
  ggthemes::theme_map()
print(q_mean_map)

scale_color_manual(values = c("purple", "magenta", "lightpink")) #lets you pick your own colors.
<ggproto object: Class ScaleDiscrete, Scale, gg>
    aesthetics: colour
    axis_order: function
    break_info: function
    break_positions: function
    breaks: waiver
    call: call
    clone: function
    dimension: function
    drop: TRUE
    expand: waiver
    get_breaks: function
    get_breaks_minor: function
    get_labels: function
    get_limits: function
    get_transformation: function
    guide: legend
    is_discrete: function
    is_empty: function
    labels: waiver
    limits: NULL
    make_sec_title: function
    make_title: function
    map: function
    map_df: function
    n.breaks.cache: NULL
    na.translate: TRUE
    na.value: grey50
    name: waiver
    palette: function
    palette.cache: NULL
    position: left
    range: environment
    rescale: function
    reset: function
    train: function
    train_df: function
    transform: function
    transform_df: function
    super:  <ggproto object: Class ScaleDiscrete, Scale, gg>

zero_q_freq: frequency of days with Q = 0 mm/day (it’s reported in a percentage)

Question 2

library(ggplot2)
library(ggthemes)
library(maps)

Attaching package: 'maps'
The following object is masked from 'package:purrr':

    map
# Question 2: map the gauge sites colored by aridity and by mean precipitation.
aridity_map <- ggplot(camels, aes(gauge_lon, gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = aridity)) +
  scale_color_viridis_c() +
  ggthemes::theme_map() +
  ggtitle("Site Locations Colored by Aridity")

print(aridity_map)

p_mean_map <- ggplot(camels, aes(gauge_lon, gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = p_mean)) +
  scale_color_gradient(low = "pink", high = "dodgerblue") +
  ggthemes::theme_map() +
  ggtitle("Site Locations Colored by Mean Precipitation (p_mean)")

print(p_mean_map)

# patchwork composes ggplots: `/` stacks vertically, `+` places side by side.
library(patchwork)
print(aridity_map / p_mean_map)   # stacked
print(aridity_map + p_mean_map)   # side by side

# Pairwise correlations among aridity, precipitation, and discharge.
# use = "complete.obs" keeps only rows complete in all three columns,
# which is exactly what select() + drop_na() + cor() did.
cor(camels[, c("aridity", "p_mean", "q_mean")], use = "complete.obs")
           aridity     p_mean     q_mean
aridity  1.0000000 -0.7550090 -0.5817771
p_mean  -0.7550090  1.0000000  0.8865757
q_mean  -0.5817771  0.8865757  1.0000000
# Create a scatter plot of aridity vs rainfall
ggplot(camels, aes(x = aridity, y = p_mean)) +
  # Add points colored by mean flow
  geom_point(aes(color = q_mean)) +
  # Add a linear regression line
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  # Apply the viridis color scale
  scale_color_viridis_c() +
  # Add a title, axis labels, and theme (w/ legend on the bottom)
  theme_linedraw() + 
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runoff",  # fixed typo: "Runnoff"
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  # Apply log transformations to the x and y axes
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runoff",  # fixed typo: "Runnoff"
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  # Apply a log transformation to the color scale
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom",
        # Expand the legend width ...
        legend.key.width = unit(2.5, "cm"),
        legend.key.height = unit(.5, "cm")) + 
  labs(title = "Aridity vs Rainfall vs Runoff",  # fixed typo: "Runnoff"
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow") 
`geom_smooth()` using formula = 'y ~ x'

set.seed(123)
# Simple transformations of the outcome are done outside the recipe
# (bad form to transform the outcome inside one).
camels <- camels |>
  mutate(logQmean = log(q_mean))

# 80/20 train/test split, plus 10-fold CV on the training set
camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)

camels_cv <- vfold_cv(camels_train, v = 10)

# Preprocessing recipe: log-transform the predictors, add their
# interaction, then drop rows with missing values.
rec <- recipe(logQmean ~ aridity + p_mean, data = camels_train) |>
  step_log(all_predictors()) |>
  step_interact(terms = ~ aridity:p_mean) |>
  step_naomit(all_predictors(), all_outcomes())

# Apply the prepped recipe to the training data
baked_data <- prep(rec, camels_train) |>
  bake(new_data = NULL)

# Interaction with lm: base lm writes interaction terms with `*`
lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)

Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.77586    0.16365 -10.852  < 2e-16 ***
aridity        -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean          1.48438    0.15511   9.570  < 2e-16 ***
aridity:p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
# Sanity Interaction term from recipe ... these should be equal!!
# (aridity_x_p_mean is the column step_interact created; its coefficient
# should match the aridity:p_mean term in the summary above.)
summary(lm(logQmean ~ aridity + p_mean + aridity_x_p_mean, data = baked_data))

Call:
lm(formula = logQmean ~ aridity + p_mean + aridity_x_p_mean, 
    data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -1.77586    0.16365 -10.852  < 2e-16 ***
aridity          -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean            1.48438    0.15511   9.570  < 2e-16 ***
aridity_x_p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16

Correct version: evaluating the fitted linear model on the held-out test set.

library(recipes)
# Apply the training-prepped recipe to the held-out test set,
# then score the linear model on it.
test_data <- prep(rec) |>
  bake(new_data = camels_test)
test_data$lm_pred <- predict(lm_base, newdata = test_data)
library(yardstick)
metrics(test_data, truth = logQmean, estimate = lm_pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
library(ggplot2)
# Observed vs predicted log mean flow for the linear model; the dashed
# 1:1 line marks perfect prediction.
lm_fit_plot <- ggplot(test_data, aes(x = logQmean, y = lm_pred, colour = aridity)) +
  scale_color_gradient2(low = "brown", mid = "orange", high = "darkgreen") +
  geom_point() +
  geom_abline(linetype = 2) +
  theme_linedraw() +
  labs(title = "Linear Model: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")
print(lm_fit_plot)

I just realized I don’t think we need to put in all the code given :)

Question 3

library(tidymodels)   # For modeling
library(xgboost)      # For XGBoost engine

Attaching package: 'xgboost'
The following object is masked from 'package:dplyr':

    slice
library(tidyverse)    # For data wrangling
# XGBoost regression using the same preprocessing recipe
xgboost_spec <- boost_tree(mode = "regression") %>%
  set_engine("xgboost")

xgboost_wf <- workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(rec)

xgboost_fit <- fit(xgboost_wf, data = camels_train)

# Predict on the RAW test set: the fitted workflow applies the recipe
# itself. Predicting on the already-baked test_data log-transformed the
# predictors twice, which is what caused the "NaNs produced" warning.
test_data$xgb_pred <- predict(xgboost_fit, new_data = camels_test)$.pred
Warning in bake.step_log(step, new_data = new_data): NaNs produced
# Bagged multilayer perceptron with the same recipe
nnet_spec <- bag_mlp(mode = "regression") %>%
  set_engine("nnet")

nnet_wf <- workflow() %>%
  add_model(nnet_spec) %>%
  add_recipe(rec)

nnet_fit <- fit(nnet_wf, data = camels_train)

# Predict on the RAW test set (not the pre-baked test_data) so the
# recipe is applied exactly once; avoids the "NaNs produced" warning.
test_data$nnet_pred <- predict(nnet_fit, new_data = camels_test)$.pred
Warning in bake.step_log(step, new_data = new_data): NaNs produced
# Random forest (ranger engine) with the same recipe
rf_spec <- rand_forest(mode = "regression") %>%
  set_engine("ranger")

rf_wf <- workflow() %>%
  add_model(rf_spec) %>%
  add_recipe(rec)

rf_fit <- fit(rf_wf, data = camels_train)
# Predict on the RAW test set (not the pre-baked test_data) so the
# recipe is applied exactly once; avoids the "NaNs produced" warning.
test_data$rf_pred <- predict(rf_fit, new_data = camels_test)$.pred
Warning in bake.step_log(step, new_data = new_data): NaNs produced
# Score each fitted model on the held-out test set, label the rows,
# then stack everything into one comparison table.
lm_scores   <- metrics(test_data, truth = logQmean, estimate = lm_pred)   %>% mutate(model = "Linear Regression")
rf_scores   <- metrics(test_data, truth = logQmean, estimate = rf_pred)   %>% mutate(model = "Random Forest")
xgb_scores  <- metrics(test_data, truth = logQmean, estimate = xgb_pred)  %>% mutate(model = "XGBoost")
nnet_scores <- metrics(test_data, truth = logQmean, estimate = nnet_pred) %>% mutate(model = "Neural Network")

model_metrics <- bind_rows(lm_scores, rf_scores, xgb_scores, nnet_scores)

print(model_metrics)
# A tibble: 12 × 4
   .metric .estimator .estimate model            
   <chr>   <chr>          <dbl> <chr>            
 1 rmse    standard       0.583 Linear Regression
 2 rsq     standard       0.742 Linear Regression
 3 mae     standard       0.390 Linear Regression
 4 rmse    standard       0.792 Random Forest    
 5 rsq     standard       0.529 Random Forest    
 6 mae     standard       0.590 Random Forest    
 7 rmse    standard       2.16  XGBoost          
 8 rsq     standard       0.454 XGBoost          
 9 mae     standard       2.02  XGBoost          
10 rmse    standard       7.25  Neural Network   
11 rsq     standard       0.289 Neural Network   
12 mae     standard       7.18  Neural Network   

Linear regression seems like it would be the best for this!

Build your own

data splitting

set.seed(123)

# 75/25 split stratified on the outcome so both sets span the q_mean range
data_split   <- initial_split(camels, prop = 0.75, strata = q_mean)
camels_train <- training(data_split)
camels_test  <- testing(data_split)

# 10-fold cross-validation on the training set, also stratified
cv_folds <- vfold_cv(camels_train, v = 10, strata = q_mean)

dim(camels_train)
[1] 503  59
dim(camels_test)   
[1] 168  59

recipe

library(recipes)

# Recipe predicting q_mean from climate / terrain / hydrology attributes.
# NOTE(review): step_log(q_mean, base = 10) yields -Inf when q_mean == 0
# and NA when q_mean is missing; those bad labels appear to be what broke
# the ranger/xgboost tuning later -- confirm rows with q_mean <= 0 or NA
# are removed before any model fitting.
rec <- recipe(q_mean ~ p_mean + aridity + elev_mean + slope_mean + 
                 geol_permeability + baseflow_index + runoff_ratio, 
              data = camels_train) %>%
  step_log(q_mean, base = 10) %>%  # log10-transform the outcome
  step_normalize(all_numeric_predictors()) %>%  # center and scale predictors
  step_impute_median(all_numeric_predictors()) %>%  # fill predictor NAs with the median
  step_corr(all_numeric_predictors(), threshold = 0.9) %>%  # drop one of any pair correlated > 0.9
  step_nzv(all_predictors())  # drop near-zero-variance predictors

# Estimate the recipe on the training data, then bake it to inspect
# the processed predictors.
rec <- prep(rec, training = camels_train)

train_prepped <- bake(rec, new_data = camels_train)
head(train_prepped)
# A tibble: 6 × 8
  p_mean aridity elev_mean slope_mean geol_permeability baseflow_index
   <dbl>   <dbl>     <dbl>      <dbl>             <dbl>          <dbl>
1  0.251 -0.0103    -0.921     -0.939             2.06          0.380 
2  0.292 -0.169     -0.911     -0.941             1.77          0.569 
3  0.303 -0.0751    -0.914     -0.931             1.38          0.482 
4  0.434 -0.107     -0.925     -0.955             0.765         0.0394
5 -1.37   0.810     -0.326     -0.880             0.302        -0.246 
6 -1.34   0.696     -0.342     -0.886             0.245         0.0675
# ℹ 2 more variables: runoff_ratio <dbl>, q_mean <dbl>

define 3 models

library(parsnip)

# Random forest: tune mtry and min_n, fix the forest at 500 trees
rf_model <- rand_forest(mtry = tune(), trees = 500, min_n = tune()) |>
  set_mode("regression") |>
  set_engine("ranger")

# Gradient boosting: tune tree count, depth, and learning rate
xgb_model <- boost_tree(trees = tune(), tree_depth = tune(), learn_rate = tune()) |>
  set_mode("regression") |>
  set_engine("xgboost")

library(baguette)

# Bagged multilayer perceptron: tune hidden units and weight decay
nn_model <- bag_mlp(hidden_units = tune(), penalty = tune()) |>
  set_mode("regression") |>
  set_engine("nnet")

workflow set

# Clean the training labels FIRST: log10(q_mean) is only defined for
# q_mean > 0, and NA/-Inf labels crash the engines during tuning
# (ranger: "Missing data in dependent variable"; xgboost: "Label
# contains NaN"). The original filtered only NA, and did so after the
# resamples were already built.
camels_train <- camels_train %>%
  filter(!is.na(q_mean), q_mean > 0)

# Rebuild the resamples from the cleaned data so every fold has
# usable labels (the earlier cv_folds still referenced dirty rows).
cv_folds <- vfold_cv(camels_train, v = 10, strata = q_mean)

# Preprocessing recipe, rebuilt on the cleaned training data
rec <- recipe(q_mean ~ p_mean + aridity + elev_mean + slope_mean + 
                 geol_permeability + baseflow_index + runoff_ratio, 
              data = camels_train) %>%
  step_log(q_mean, base = 10) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_corr(all_numeric_predictors(), threshold = 0.9) %>%
  step_nzv(all_predictors())

# Random Forest 
rf_model <- rand_forest(mtry = tune(), trees = 500, min_n = tune()) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# XGBoost model 
xgb_model <- boost_tree(trees = tune(), tree_depth = tune(), learn_rate = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# Neural Network 
nn_model <- bag_mlp(hidden_units = tune(), penalty = tune()) %>%
  set_engine("nnet") %>%
  set_mode("regression")

# One workflow per model, all sharing the same recipe. Built once --
# the original constructed the identical workflow_set twice.
model_set <- workflow_set(
  preproc = list("recipe" = rec),
  models = list(
    "Random Forest" = rf_model, 
    "XGBoost" = xgb_model, 
    "Neural Network" = nn_model
  )
)

# Tune each workflow over a 10-point grid with 10-fold CV
model_fit <- model_set %>%
  workflow_map(
    "tune_grid",  
    resamples = cv_folds,
    metrics = metric_set(rmse, rsq, mae),
    grid = 10,  
    verbose = TRUE
  )
i 1 of 3 tuning:     recipe_Random Forest
i Creating pre-processing data to finalize unknown parameter: mtry
→ A | error:   Error: Missing data in dependent variable.
There were issues with some computations   A: x1
There were issues with some computations   A: x31
There were issues with some computations   A: x90
✔ 1 of 3 tuning:     recipe_Random Forest (3s)
i 2 of 3 tuning:     recipe_XGBoost
→ A | error:   [14:35:16] src/data/data.cc:461: Check failed: valid: Label contains NaN, infinity or a value too large.
               Stack trace:
                 [bt] (0) 1   xgboost.so                          0x000000010b315c7c dmlc::LogMessageFatal::~LogMessageFatal() + 124
                 [bt] (1) 2   xgboost.so                          0x000000010b3ad460 xgboost::MetaInfo::SetInfoFromHost(xgboost::GenericParameter const&, xgboost::StringView, xgboost::Json) + 3616
                 [bt] (2) 3   xgboost.so                          0x000000010b3ae6e8 xgboost::MetaInfo::SetInfo(xgboost::GenericParameter const&, char const*, void const*, xgboost::DataType, unsigned long) + 168
                 [bt] (3) 4   xgboost.so                          0x000000010b4cc184 XGDMatrixSetFloatInfo + 132
                 [bt] (4) 5   xgboost.so                          0x000000010b310eec XGDMatrixSetInfo_R + 620
                 [bt] (5) 6   libR.dylib                          0x00000001032409b4 R_doDotCall + 1588
                 [bt] (6) 7   libR.dylib                          0x000000010329ccfc bcEval_loop + 128060
                 [bt] (7) 
→ B | error:   [14:35:19] src/data/data.cc:461: Check failed: valid: Label contains NaN, infinity or a value too large.
               Stack trace:
                 [bt] (0) 1   xgboost.so                          0x000000010b315c7c dmlc::LogMessageFatal::~LogMessageFatal() + 124
                 [bt] (1) 2   xgboost.so                          0x000000010b3ad460 xgboost::MetaInfo::SetInfoFromHost(xgboost::GenericParameter const&, xgboost::StringView, xgboost::Json) + 3616
                 [bt] (2) 3   xgboost.so                          0x000000010b3ae6e8 xgboost::MetaInfo::SetInfo(xgboost::GenericParameter const&, char const*, void const*, xgboost::DataType, unsigned long) + 168
                 [bt] (3) 4   xgboost.so                          0x000000010b4cc184 XGDMatrixSetFloatInfo + 132
                 [bt] (4) 5   xgboost.so                          0x000000010b310eec XGDMatrixSetInfo_R + 620
                 [bt] (5) 6   libR.dylib                          0x00000001032409b4 R_doDotCall + 1588
                 [bt] (6) 7   libR.dylib                          0x000000010329ccfc bcEval_loop + 128060
                 [bt] (7) 
There were issues with some computations   A: x30   B: x1
→ C | error:   [14:35:20] src/data/data.cc:461: Check failed: valid: Label contains NaN, infinity or a value too large.
               Stack trace:
                 [bt] (0) 1   xgboost.so                          0x000000010b315c7c dmlc::LogMessageFatal::~LogMessageFatal() + 124
                 [bt] (1) 2   xgboost.so                          0x000000010b3ad460 xgboost::MetaInfo::SetInfoFromHost(xgboost::GenericParameter const&, xgboost::StringView, xgboost::Json) + 3616
                 [bt] (2) 3   xgboost.so                          0x000000010b3ae6e8 xgboost::MetaInfo::SetInfo(xgboost::GenericParameter const&, char const*, void const*, xgboost::DataType, unsigned long) + 168
                 [bt] (3) 4   xgboost.so                          0x000000010b4cc184 XGDMatrixSetFloatInfo + 132
                 [bt] (4) 5   xgboost.so                          0x000000010b310eec XGDMatrixSetInfo_R + 620
                 [bt] (5) 6   libR.dylib                          0x00000001032409b4 R_doDotCall + 1588
                 [bt] (6) 7   libR.dylib                          0x000000010329ccfc bcEval_loop + 128060
                 [bt] (7) 
There were issues with some computations   A: x30   B: x1
There were issues with some computations   A: x30   B: x20   C: x32
There were issues with some computations   A: x30   B: x20   C: x40

✔ 2 of 3 tuning:     recipe_XGBoost (4.8s)
i 3 of 3 tuning:     recipe_Neural Network
✔ 3 of 3 tuning:     recipe_Neural Network (22.6s)
collect_metrics(model_fit)
# A tibble: 90 × 9
   wflow_id        .config preproc model .metric .estimator   mean     n std_err
   <chr>           <chr>   <chr>   <chr> <chr>   <chr>       <dbl> <int>   <dbl>
 1 recipe_Random … Prepro… recipe  rand… mae     standard   0.0841     1      NA
 2 recipe_Random … Prepro… recipe  rand… rmse    standard   0.141      1      NA
 3 recipe_Random … Prepro… recipe  rand… rsq     standard   0.945      1      NA
 4 recipe_Random … Prepro… recipe  rand… mae     standard   0.102      1      NA
 5 recipe_Random … Prepro… recipe  rand… rmse    standard   0.164      1      NA
 6 recipe_Random … Prepro… recipe  rand… rsq     standard   0.928      1      NA
 7 recipe_Random … Prepro… recipe  rand… mae     standard   0.0445     1      NA
 8 recipe_Random … Prepro… recipe  rand… rmse    standard   0.0832     1      NA
 9 recipe_Random … Prepro… recipe  rand… rsq     standard   0.979      1      NA
10 recipe_Random … Prepro… recipe  rand… mae     standard   0.0585     1      NA
# ℹ 80 more rows
# Visual comparison of tuning results across the three workflows
autoplot(model_fit)

# Workflows ranked by cross-validated performance (best first)
rank_results(model_fit)
# A tibble: 90 × 9
   wflow_id       .config .metric    mean std_err     n preprocessor model  rank
   <chr>          <chr>   <chr>     <dbl>   <dbl> <int> <chr>        <chr> <int>
 1 recipe_Neural… Prepro… mae     0.00733 8.26e-4    10 recipe       bag_…     1
 2 recipe_Neural… Prepro… rmse    0.0198  4.57e-3    10 recipe       bag_…     1
 3 recipe_Neural… Prepro… rsq     0.998   6.53e-4    10 recipe       bag_…     1
 4 recipe_Neural… Prepro… mae     0.00954 8.39e-4    10 recipe       bag_…     2
 5 recipe_Neural… Prepro… rmse    0.0214  5.21e-3    10 recipe       bag_…     2
 6 recipe_Neural… Prepro… rsq     0.998   8.07e-4    10 recipe       bag_…     2
 7 recipe_Neural… Prepro… mae     0.00920 8.30e-4    10 recipe       bag_…     3
 8 recipe_Neural… Prepro… rmse    0.0216  4.03e-3    10 recipe       bag_…     3
 9 recipe_Neural… Prepro… rsq     0.998   5.53e-4    10 recipe       bag_…     3
10 recipe_Neural… Prepro… mae     0.00913 1.25e-3    10 recipe       bag_…     4
# ℹ 80 more rows

Evaluation

library(tune)

# Rank the tuned workflows by cross-validated performance and print
# the leaderboard (best-performing configurations first).
ranked <- rank_results(model_fit)
print(ranked)
# A tibble: 90 × 9
   wflow_id       .config .metric    mean std_err     n preprocessor model  rank
   <chr>          <chr>   <chr>     <dbl>   <dbl> <int> <chr>        <chr> <int>
 1 recipe_Neural… Prepro… mae     0.00733 8.26e-4    10 recipe       bag_…     1
 2 recipe_Neural… Prepro… rmse    0.0198  4.57e-3    10 recipe       bag_…     1
 3 recipe_Neural… Prepro… rsq     0.998   6.53e-4    10 recipe       bag_…     1
 4 recipe_Neural… Prepro… mae     0.00954 8.39e-4    10 recipe       bag_…     2
 5 recipe_Neural… Prepro… rmse    0.0214  5.21e-3    10 recipe       bag_…     2
 6 recipe_Neural… Prepro… rsq     0.998   8.07e-4    10 recipe       bag_…     2
 7 recipe_Neural… Prepro… mae     0.00920 8.30e-4    10 recipe       bag_…     3
 8 recipe_Neural… Prepro… rmse    0.0216  4.03e-3    10 recipe       bag_…     3
 9 recipe_Neural… Prepro… rsq     0.998   5.53e-4    10 recipe       bag_…     3
10 recipe_Neural… Prepro… mae     0.00913 1.25e-3    10 recipe       bag_…     4
# ℹ 80 more rows
autoplot(model_fit)

Based on the rank_results() output above, the bagged neural network actually ranks first (R² ≈ 0.998 with the lowest RMSE and MAE), although XGBoost also performs well. A higher R² means the model explains more of the variance in the outcome.

Extract and Evaluate

# 1. Create recipe (same steps as the tuning recipe; note log10(q_mean)
#    requires q_mean > 0)
rec <- recipe(q_mean ~ p_mean + aridity + elev_mean + slope_mean + 
                geol_permeability + baseflow_index + runoff_ratio,
              data = camels_train) %>%
  step_log(q_mean, base = 10) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_corr(all_numeric_predictors(), threshold = 0.9) %>%
  step_nzv(all_predictors())
# 2. Get best XGBoost hyperparameters from tuning results (by R^2)
best_xgb <- model_fit %>%
  extract_workflow_set_result("recipe_XGBoost") %>%
  select_best(metric = "rsq")

final_xgb <- finalize_model(xgb_model, best_xgb)
# 3. Build and fit the workflow with the final model and recipe
final_xgb_wf <- workflow() %>%
  add_model(final_xgb) %>%
  add_recipe(rec)

final_xgb_fit <- final_xgb_wf %>% fit(data = camels_train)
# 4. Clean test set: log10 is undefined for missing or zero q_mean
camels_test_clean <- camels_test %>%
  filter(!is.na(q_mean), q_mean > 0)
# 5. Predict through the fitted WORKFLOW: it applies the recipe that was
#    prepped during fit(), so there is no need to re-prep the recipe and
#    bake the test data by hand (doing both risks inconsistent
#    preprocessing between training and prediction).
xgb_preds <- predict(final_xgb_fit, new_data = camels_test_clean) %>%
  bind_cols(q_mean_obs = camels_test_clean$q_mean) %>%
  mutate(q_mean_pred = 10^.pred)  # predictions come back on the log10 scale
# 6. Observed vs predicted streamflow on log-log axes; the dashed 1:1
#    line marks perfect prediction.
ggplot(xgb_preds, aes(x = q_mean_pred, y = q_mean_obs)) +
  geom_point(alpha = 0.7, color = "mediumorchid") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "XGBoost — Observed vs Predicted Streamflow",
    x = "Predicted q_mean",
    y = "Observed q_mean"
  ) +
  theme_minimal()

The XGBoost model worked well on the test data: it predicted values very similar to the observed streamflow values. The log–log scatterplot shows a cluster of points around the 1:1 line, which indicates accurate predictions. There is not much spread, even at the low and high extremes, which suggests the model generalizes well and captures key hydrological patterns. This supports the high R² score from cross-validation and reinforces that XGBoost is a good choice for modeling streamflow in this dataset.